library(rmapshaper)
library(censusapi)
library(shapefiles)
library(base)
library(tidyverse)
library(tigris)
library(sf)
library(mapview)
library(raster)
library(rlang)
library(rgeos)
library(data.table)
library(measurements)
library(smoothr)
library(lwgeom)
library(units)
library(geosphere)
library(kableExtra)
library(leaflet)
mapviewOptions(
basemaps = "OpenStreetMap"
)
options(
tigris_class = "sf",
tigris_use_cache = TRUE
)
Sys.setenv(CENSUS_KEY="b88495f8315f45b2d100dee9ba8f4c489a7371c2")
projection <- "+proj=utm +zone=10 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=ft +no_defs"
`%notin%` <- Negate(`%in%`)
Does the location of where one lives impact the ability to enjoy outdoor green space? Does the size of the yard at your house play a factor in whether you regularly visit parks? Are communities under-visiting parks that are physically closer to them? The goal of this analysis is to provide insight on park accessibility within communities in the City of San Jose. We will consider factors such as backyard size, park visit frequency, and distance from a park to determine a community’s park accessibility.
The first few sections describe our methodological approaches for preparing the factors for analysis, followed by sections detailing the comparisons between factors and the resulting conclusions.
The following describes how the Safegraph data is processed:
Safegraph collects device visit information to “places of interest” (POI) like parks or grocery stores and includes some information about a device’s origin census block group. The visit counts in the Safegraph dataset fall into either one of these three cases:
Note: We obtained all of the data for the park visits portion of the analysis from Safegraph Patterns Data.
Our goal for this section is to obtain the average residential yard size for each census block group in San Jose. We start with parcel data from the Santa Clara County Assessor’s Office, and filter it down to only include residential zoned parcels. The filtering criteria is based on property use codes that are classified as residential. The following table provides more detail:
puc_table <- data.frame("Use_Code" = c(1,2,3,4,6,7), "Description" = c("Single Family", "Two Family", "Three and Four Family","Five or More Family","Condominium, Townhouse","Fraternity, Sorority, Boarding, Rooming House"))
kable(puc_table)
| Use_Code | Description |
|---|---|
| 1 | Single Family |
| 2 | Two Family |
| 3 | Three and Four Family |
| 4 | Five or More Family |
| 6 | Condominium, Townhouse |
| 7 | Fraternity, Sorority, Boarding, Rooming House |
After filtering to parcels with these residential use codes, we spatially match the building footprint shape associated with that parcel. To get from parcel area to yard area, we subtract the building footprint shape out of the parcel shape.
#load("P:/SFBI/Data Library/OSM/osm_bldg.Rdata")
#
#osm_bldg <-
# osm_bldg %>% st_transform(projection)
sj_boundary <-
places("CA", cb=FALSE) %>%
filter(NAME == "San Jose") %>%
st_transform(projection)
#
sj_blockgroup <-
readRDS("sj_blockgroups.rds") %>%
st_transform(projection)
#mapview(sj_blockgroup)
#load("P:/SFBI/Data Library/California Blocks/ca_blocks_lite.Rdata")
#ca_blocks_lite <- ca_blocks_lite %>% st_transform(projection)
#sj_blocks <- ca_blocks_lite[sj_boundary,]
#save(sj_blocks, file = "sj_blocks.Rdata")
#load("sj_blocks.Rdata")
#mapview(sj_blocks[100,])
# sj_bldg <-
# osm_bldg[sj_boundary,] %>%
# transmute(
# index = row_number()
# )
#save(sj_bldg, file = "sj_bldg.Rdata")
load("sj_bldg.Rdata")
# scc_parcels <- st_read("P:/SFBI/Data Library/Parcels/santa clara/scc.shp")
#
# scc_parcels <-
# scc_parcels %>%
# st_transform(projection)
#
# This includes parcels parcels at the periphery of San Jose. This is important for the "block method" used to find street edges. otherwise this sf object isn't used.
# sj_parcels_plus <-
# scc_parcels[sj_blocks,] %>%
# dplyr::select(APN, USE_CODE, SITUS_CITY)
# #
# sj_parcels <-
# sj_parcels_plus %>%
# filter(SITUS_CITY == "SAN JOSE")
# #
# save(sj_parcels_plus, sj_parcels, file = "sj_parcels.Rdata")
load("sj_parcels.Rdata")
# apn_bg_lookup <- data.frame(APN = NA, origin_census_block_group = NA)
#
# for(i in 1:nrow(sj_blockgroup)){
# if(i %% 10 == 0){print(i)}
# temp <- sj_blockgroup[i,]
# temp2 <- sj_parcels[temp,] %>%
# dplyr::select(APN) %>%
# st_set_geometry(NULL) %>%
# mutate(
# origin_census_block_group = temp$origin_census_block_group
# )
# apn_bg_lookup <-
# apn_bg_lookup %>%
# rbind(temp2)
# }
#
# apn_bg_lookup <-
# apn_bg_lookup %>%
# na.omit()
#
# saveRDS(apn_bg_lookup,"sj_apn_bg_lookup.rds")
apn_bg_lookup <- readRDS("sj_apn_bg_lookup.rds")
sj_parcels_residential <-
sj_parcels %>%
mutate(
USE_CODE = as.numeric(USE_CODE)
) %>%
filter(USE_CODE %in% c(1,2,3,4,6,7))
#mapview(head(sj_parcels_residential,100))
matched_bldg_footprint <-
sj_bldg %>%
st_centroid() %>%
st_intersection(sj_parcels_residential) %>%
st_set_geometry(NULL) %>%
left_join(sj_bldg, by = "index") %>%
st_as_sf()
#
#
residential_parcels <-
sj_parcels_residential %>%
mutate(
parcel_area = st_area(.) %>% as.numeric()
) %>%
as.data.frame() %>%
st_as_sf() %>%
rename(parcel_geometry = geometry)
#
#
residential_bldg <-
residential_parcels %>%
as.data.frame() %>%
right_join(
matched_bldg_footprint %>% dplyr::select(APN),
by = "APN"
) %>%
filter(!is.na(parcel_area)) %>%
rename(building_geometry = geometry) %>%
st_as_sf() %>%
st_set_geometry("building_geometry")
#
#
residential_parcels <-
residential_bldg %>%
st_set_geometry("parcel_geometry")
#
residential_parcels <- residential_parcels[!duplicated(residential_parcels), ]
##############################################################
# available_yard <-
# st_difference(
# residential_parcels[1:10000,],
# st_union(
# residential_parcels$building_geometry[1:10000,])
# )
#
# available_yard2 <-
# st_difference(
# residential_parcels[10001:20000,],
# st_union(
# residential_parcels$building_geometry[10001:20000,])
# )
#
# available_yard3 <-
# st_difference(
# residential_parcels[20001:30000,],
# st_union(
# residential_parcels$building_geometry[20001:30000,])
# )
# #
# available_yard4 <-
# st_difference(
# residential_parcels[30001:40000,],
# st_union(
# residential_parcels$building_geometry[30001:40000,])
# )
#
# available_yard5 <-
# st_difference(
# residential_parcels[40001:50000,],
# st_union(
# residential_parcels$building_geometry[40001:50000,])
# )
#
# available_yard6 <-
# st_difference(
# residential_parcels[50001:60000,],
# st_union(
# residential_parcels$building_geometry[50001:60000,])
# )
#
# available_yard7 <-
# st_difference(
# residential_parcels[60001:70000,],
# st_union(
# residential_parcels$building_geometry[60001:70000,])
# )
#
# available_yard8 <-
# st_difference(
# residential_parcels[70001:80000,],
# st_union(
# residential_parcels$building_geometry[70001:80000,])
# )
#
# available_yard9 <-
# st_difference(
# residential_parcels[80001:90000,],
# st_union(
# residential_parcels$building_geometry[80001:90000,])
# )
#
# available_yard10 <-
# st_difference(
# residential_parcels[90001:100000,],
# st_union(
# residential_parcels$building_geometry[90001:100000,])
# )
#
# available_yard10 <-
# st_difference(
# residential_parcels[100001:nrow(residential_parcels),],
# st_union(
# residential_parcels$building_geometry[100001:nrow(residential_parcels),])
# )
# all_available_yard <-
# available_yard %>%
# rbind(available_yard2) %>%
# rbind(available_yard3) %>%
# rbind(available_yard4) %>%
# rbind(available_yard5) %>%
# rbind(available_yard6) %>%
# rbind(available_yard7) %>%
# rbind(available_yard8) %>%
# rbind(available_yard9) %>%
# rbind(available_yard10)
#save(all_available_yard, file = "sj_all_yard_available.Rdata")
load("sj_all_yard_available.Rdata")
all_available_yard_other <-
all_available_yard %>%
filter(USE_CODE == 2 | USE_CODE == 3)
available_area_fun <- function(df, residential_parcels){
df %>%
as.data.frame() %>%
rename(geometry = parcel_geometry) %>%
left_join(residential_parcels %>% dplyr::select(APN), by = "APN") %>%
st_as_sf() %>%
st_set_geometry("geometry") %>%
as_Spatial() %>%
gBuffer(byid = T, width = 0) %>%
sp::disaggregate() %>%
st_as_sf() %>%
rename(available_area_geometry = geometry) %>%
st_set_geometry("available_area_geometry") %>%
mutate(
available_area = round(st_area(.) %>% as.numeric())
) %>%
st_set_crs(projection)
}
# save(available_area, file = "sj_available_backyard_area.Rdata")
#!!!!This .Rdata file constians all of the yard areas for parcels zoned as single-family residential!!!!
load("sj_available_backyard_area.Rdata")
nix_list <- c("69403028","41425018","67013004","67013003","67029024","74207011")
nix_df <-
available_area %>%
filter(APN %in% nix_list) %>%
left_join(
all_available_yard %>%
st_set_geometry(NULL) %>%
dplyr::select(APN, USE_CODE) %>%
filter(APN %in% nix_list),
by = "APN"
)
available_area1_other <- available_area_fun(all_available_yard_other[1:nrow(all_available_yard_other),], residential_parcels)
available_area1_other <-
available_area1_other %>%
dplyr::select(APN, available_area, available_area_geometry)
yard_area_multifamily <-
residential_parcels %>%
filter(USE_CODE %in% c(4,6,7)) %>%
dplyr::select(APN, parcel_geometry) %>%
mutate(
available_area = 0
) %>%
rename(available_area_geometry = parcel_geometry)
available_area_combo <-
available_area %>%
rbind(available_area1_other) %>%
rbind(yard_area_multifamily)
available_area_bg <-
available_area_combo %>%
left_join(
apn_bg_lookup %>%
filter(APN %in% available_area$APN),
by = "APN"
) %>%
filter(APN %notin% nix_list)
#
bg_area_avg <-
available_area_bg %>%
st_set_geometry(NULL) %>%
group_by(origin_census_block_group) %>%
summarise(yard_area_avg = mean(available_area))
#
extra_bg <-
sj_blockgroup %>%
filter(origin_census_block_group %notin% bg_area_avg$origin_census_block_group) %>%
st_set_geometry(NULL) %>%
dplyr::select(origin_census_block_group) %>%
mutate(yard_area_avg = 0)
#
bg_area_avg_all <-
bg_area_avg %>%
rbind(extra_bg) %>%
left_join(
sj_blockgroup %>%
dplyr::select(-DISTRICTS),
by = "origin_census_block_group"
) %>%
na.omit() %>%
st_as_sf() %>%
st_transform(4326)
#save(bg_area_avg_all, file = "sj_bg_avg_yard_area.Rdata")
load("sj_bg_avg_yard_area.Rdata")
The following map shows the void building footprint within the parcel shape for one block group. The resulting shape represents the yard area.
example_bg_yard <-
available_area_bg %>%
filter(origin_census_block_group == "060855006002")
mapview(example_bg_yard)
In order to align our geographic granularity with the other factors of our analysis, we take all of the yard sizes within a block group and calculate one single average yard value. The following map shows the average yard sizes for all of the block groups in San Jose.
#save(bg_area_avg_all_inflate, file = "sj_inflate_avg_yard.Rdata")
load("sj_inflate_avg_yard.Rdata")
blue_pal <- colorNumeric(
palette = colorRamp(c("#FFFFFF", "#00288c"), interpolate="spline"),
domain =
bg_area_avg_all_inflate %>%
pull(yard_area_avg) %>%
unique()
)
yard_bg_map <-
leaflet() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(
data = sj_boundary %>% st_transform(4326),
fill = F,
color = "gray",
weight = 1,
label = ~NAME
) %>%
addPolygons(
data = bg_area_avg_all_inflate,
fillColor = ~blue_pal(yard_area_avg),
color = "black",
weight = 0.5,
opacity = 0.5,
fillOpacity = 0.75,
#group = "Average Daily Park Visits",
label = ~paste0("Block Group Average Yard Size: ",round(yard_area_avg), " square feet"),
highlightOptions =
highlightOptions(
weight = 2.25,
opacity = 1
)) %>%
addLegend(
position = "topright",
pal = blue_pal,
values = bg_area_avg_all_inflate$yard_area_avg,
title = "Avg. Yard Size (sqft.)",
opacity = 1
)
yard_bg_map
Notice that the few dark blue parcels have extremely large average yard sizes (you can see the size by hovering over the shape on the map). This may due to zoning misclassifications. For example, the following parcel has a Property Use Code of 1 (single-family residential), but it is actually a water treatment plant. This “yard” (that has an area of 1.0067310^{6} square feet) was inflating the average residential yard size for the block group in which it is located.
mapview(nix_df[7,])
I did a visual inspection of the parcels in the block groups with large average yard sizes (over 30,000 sqft.), and removed the parcels that appeared to be misclassified. Note that there still may be other misclassified parcels (with smaller yard sizes) that are still being incorporated into the block group average yard size.
The following map excludes the parcels that I determined had a zoning misclassification. The coloring is no longer dominated by the extremely large yards.
bg_area_avg_minus <-
bg_area_avg_all %>%
filter(origin_census_block_group %notin% c("060855121001","060855119112","060855043081","69403028"))
blue_pal2 <- colorNumeric(
palette = colorRamp(c("#FFFFFF", "#00288c"), interpolate="spline"),
domain =
bg_area_avg_minus %>%
pull(yard_area_avg) %>%
unique()
)
yard_bg_map_minus <-
leaflet() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(
data = sj_boundary %>% st_transform(4326),
fill = F,
color = "gray",
weight = 1,
label = ~NAME
) %>%
addPolygons(
data = bg_area_avg_minus,
fillColor = ~blue_pal2(yard_area_avg),
color = "black",
weight = 0.5,
opacity = 0.5,
fillOpacity = 0.75,
#group = "Average Daily Park Visits",
label = ~paste0("Block Group Average Yard Size: ",round(yard_area_avg), " square feet"),
highlightOptions =
highlightOptions(
weight = 2.25,
opacity = 1
)) %>%
addLegend(
position = "topright",
pal = blue_pal2,
values = bg_area_avg_minus$yard_area_avg,
title = "Avg. Yard Size (sqft.)",
opacity = 1
)
yard_bg_map_minus
Some overall San Jose residential yard size statistics:
Mean San Jose Yard Size: 4861
Standard Deviation: 2.610^{4}
Minimum San Jose Yard Size: 0
Maximum San Jose Yard Size: 5.110^{5}
For now, I will but a hold on this analysis and move to the average number of people who visit San Jose parks that reside in these block groups. At the end, we will come back to these yard areas to compare against other accessibility factors.
When looking at park visitors who live in specific census block groups (CBGs), it is not sufficient to consider the CBG visit numbers at face value. The population of each block group plays an important factor in weighing a community’s visits to parks. For example, if a CBG has 3 residents, and they have an average of 3 daily park visits, then we can infer that this community has very active park visitors (even though the 3 park visits may seem low).
The following map displays all of the census block groups we considered in this analysis (see the Safegraph Data Collection Explanation section above for why some CBGs are missing). The “Park Visits” option illustrates the average number of visits that each CBG has had to a SJ park since March 2, 2020. The “Park Visits per Population” displays a ratio, which provides more insight into how actively a community visits parks (a higher ratio = more active, a lower ratio = less active).
####periodically update from sanjose_park_process file
park_origins <- readRDS("sj_park_origins.rds") %>%
filter(origin_census_block_group %in% sj_blockgroup$origin_census_block_group)
park_geometry <-
readRDS('park_output_geometry.rds') %>%
st_as_sf() %>%
st_transform(projection)
sj_population <-
getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = c("group(B01003)")
) %>%
mutate(
origin_census_block_group =
paste0(state,county,tract,block_group)
) %>%
rename(
total_pop = "B01003_001E"
) %>%
dplyr::select(total_pop, origin_census_block_group)
park_origin_mean <-
park_origins %>%
group_by(origin_census_block_group) %>%
summarise(mean_visits = mean(mean_origin_visits))
visit_pop_combo <-
sj_population %>%
filter(origin_census_block_group %in% park_origin_mean$origin_census_block_group) %>%
left_join(
park_origin_mean,
by = "origin_census_block_group"
) %>%
na.omit() %>%
mutate(
visits_per_pop = round(mean_visits/total_pop,3)
) %>%
left_join(
sj_blockgroup %>%
dplyr::select(-DISTRICTS),
by = "origin_census_block_group"
) %>%
st_as_sf() %>%
st_transform(projection)
orange_pal <- colorNumeric(
palette = colorRamp(c("#fcd786", "#a67100"), interpolate="spline"),
domain =
visit_pop_combo %>%
pull(mean_visits) %>%
unique()
)
teal_pal <- colorNumeric(
palette = colorRamp(c("#d6fffe", "#015a68"), interpolate="spline"),
domain =
visit_pop_combo %>%
pull(visits_per_pop) %>%
unique()
)
all_visitors_map <-
leaflet() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(
data = sj_boundary %>% st_transform(4326),
fill = F,
color = "gray",
weight = 1,
label = ~NAME
) %>%
addPolygons(
data = visit_pop_combo %>%
st_transform(4326),
fillColor = ~orange_pal(mean_visits),
color = "black",
weight = 0.5,
opacity = 0.5,
fillOpacity = 0.75,
group = "Average Daily Park Visits",
label = ~paste0(round(mean_visits)," average daily visits to SMC parks"),
highlightOptions =
highlightOptions(
weight = 2.25,
opacity = 1
)) %>%
addPolygons(
data = visit_pop_combo %>%
st_transform(4326),
fillColor = ~teal_pal(visits_per_pop),
color = "black",
weight = 0.5,
opacity = 0.5,
fillOpacity = 0.75,
group = "Average Park Visits per Population",
label = ~paste0(visits_per_pop," average daily park visits per population"),
highlightOptions =
highlightOptions(
weight = 2.25,
opacity = 1
)
) %>%
addLayersControl(
baseGroups = c("Average Daily Park Visits", "Average Park Visits per Population"),
options = layersControlOptions(collapsed = FALSE)
)
all_visitors_map
Some interesting results I would like to highlight:
The block group that has both the highest average daily park visits and highest visit/population ratio is shown in the following view. Interestingly, this CBG contains the Santa Clara County jail. Approximately 85.8% of the residents of this block group visit a San Jose park everyday. Perhaps the inmates perform community service frequently at San Jose parks.
all_visitors_map %>%
setView(-121.906408, 37.350420, zoom = 15)
An additional block group that stands out is one located near Emma Prusch Farm Park. This CBG contains mainly an on-ramp, a portion of Bayshore Freeway, and a storage center, but no residential area. In the “Average Daily Park Visits” view, this CBG does not stand out. But when you switch to the “Average Park Visits per Population” view, the CBG shows to have the second highest visits/population ratio in all of San Jose (although the daily park visit average is only 8, 53% of those people visit parks everyday). This may be an instance of a homeless encampment underneath or near the freeway on-ramp, whose residents visit the nearby Emma Prusch Farm Park quite frequently.
all_visitors_map %>%
setView(-121.847157, 37.335857, zoom = 15)
What communities visit parks more or less frequently than others? We will now directly compare the CBGs with the lowest and highest park visits per population ratios since March 2, 2020.
We considered a low visits per population ratio to be less than 0.02, and a high ratio to be more than 0.127 (one SD less/more than the mean, respectively).
lowest_visitors <-
visit_pop_combo %>%
filter(visits_per_pop < mean(visit_pop_combo$visits_per_pop)-sd(visit_pop_combo$visits_per_pop)) %>%
st_as_sf() %>%
st_transform(projection)
purple_pal <- colorNumeric(
palette = colorRamp(c("#b27ef2", "#4900a3"), interpolate="spline"),
domain =
lowest_visitors %>%
pull(visits_per_pop) %>%
unique()
)
highest_visitors <-
visit_pop_combo %>%
filter(visits_per_pop > mean(visit_pop_combo$visits_per_pop)+sd(visit_pop_combo$visits_per_pop)) %>%
st_as_sf() %>%
st_transform(projection)
red_pal <- colorNumeric(
palette = colorRamp(c("#ff8585", "#9c0000"), interpolate="spline"),
domain =
highest_visitors %>%
pull(visits_per_pop) %>%
unique()
)
extremes_visitors_map <-
leaflet() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(
data = sj_boundary %>%
st_transform(4326),
fill = F,
color = "gray",
weight = 1,
label = ~NAME
) %>%
addPolygons(
data = lowest_visitors %>%
st_transform(4326),
fillColor = ~purple_pal(visits_per_pop),
color = "black",
weight = 0.5,
opacity = 0.5,
fillOpacity = 0.75,
group = "CBGs with Lowest Park Visit Ratio",
label = ~paste0(visits_per_pop," park visits per population"),
highlightOptions =
highlightOptions(
weight = 2.25,
opacity = 1
)
) %>%
addPolygons(
data = highest_visitors %>%
st_transform(4326),
fillColor = ~red_pal(visits_per_pop),
color = "black",
weight = 0.5,
opacity = 0.5,
fillOpacity = 0.75,
group = "CBGs with Highest Park Visit Ratio",
label = ~paste0(visits_per_pop," park visits per population"),
highlightOptions =
highlightOptions(
weight = 2.25,
opacity = 1
)
) %>%
addLayersControl(
overlayGroups = c("CBGs with Lowest Park Visit Ratio", "CBGs with Highest Park Visit Ratio"),
options = layersControlOptions(collapsed = FALSE)
)
extremes_visitors_map
#mapview(lowest_visitors, zcol = "visits_per_pop")
In the geographic sense, there appears to be no specific CBG pattern that would allow for predictive classification of a low or high visit/population ratio.
Analysis still yet to come
The following scatter plot represents a linear regression between the average residential yard size and average park visits per population for each block group in San Jose. There is a negative correlation between these two factors, representing that in general, people who have larger yards tend to visit parks less. See the linear model summary after the plot for the statistical results.
yard_pop_tied <-
visit_pop_combo %>%
left_join(
bg_area_avg_all %>%
st_set_geometry(NULL) %>%
dplyr::select(yard_area_avg, origin_census_block_group),
by = "origin_census_block_group"
) %>%
filter(yard_area_avg < 30000)
lim_mod <- lm(yard_pop_tied$visits_per_pop ~ yard_pop_tied$yard_area_avg)
lim_scatter <-
yard_pop_tied %>%
ggplot(
aes(
x = yard_area_avg,
y = visits_per_pop
)
) +
geom_point() +
geom_smooth(method=lm) +
labs(
x = "Average CBG Residential Yard Area (sqft.)",
y = "Visits per CBG Population",
title = "SMC Park Accessibility: Parks Visits per Population vs. Residential Yard Area"
)
lim_scatter
summary(lim_mod)
##
## Call:
## lm(formula = yard_pop_tied$visits_per_pop ~ yard_pop_tied$yard_area_avg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.05886 -0.02686 -0.01158 0.01370 0.78014
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.786e-02 3.050e-03 25.529 <2e-16 ***
## yard_pop_tied$yard_area_avg -1.305e-06 6.559e-07 -1.989 0.0472 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05327 on 564 degrees of freedom
## Multiple R-squared: 0.006965, Adjusted R-squared: 0.005204
## F-statistic: 3.956 on 1 and 564 DF, p-value: 0.04719